subdiagnosis <- readr::read_tsv(
file.path("..", "..", "..", "data", "current", params$scpca_project_id, "single_cell_metadata.tsv"),
show_col_types = FALSE
) |>
dplyr::filter(scpca_sample_id == params$sample_id) |>
dplyr::pull(subdiagnosis)This notebook explores using CopyKAT
to estimate tumor and normal cells in SCPCS000205 from SCPCP000006. This
sample has a(n) Favorable subdiagnosis.
CopyKAT was run using the copyKAT.R script
with and without a normal reference.
These results are read into this notebook and used to:
CopyKAT to annotate tumor cells.CopyKAT to cell type
annotations using label transfer and the fetal (kidney) references.# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)
# The current data directory, found within the repository base directory
data_dir <- file.path(repository_base, "data", "current", params$scpca_project_id)
# The path to this module
module_base <- file.path(repository_base, "analyses", "cell-type-wilms-tumor-06")In this notebook, we are working with the Wilms tumor sample defined
in SCPCS000205 from the Wilms tumor dataset SCPCP000006. We work with
the pre-processed and labeled Seurat object that is the
output of
02b_label-transfer_fetal_kidney_reference_Stewart.Rmd saved
in the results directory.
result_dir <- file.path(module_base, "results", params$sample_id)
# copykat results
copykat_output_obj_noref <- file.path(result_dir, "copyKAT", "noref", "05_final-copykat.rds")
copykat_output_obj_ref <- file.path(result_dir, "copyKAT", "noref", "05_final-copykat.rds")
copykat_objs <- c(
no_ref = copykat_output_obj_noref,
with_ref = copykat_output_obj_ref
) |>
purrr::map(readr::read_rds)
# predictions files
predictions_file <- glue::glue("{params$sample_id}_copykat_prediction.txt")
predictions_paths <- c(
no_ref = file.path(result_dir, "copyKAT", "noref", predictions_file),
with_ref = file.path(result_dir, "copyKAT", "ref", predictions_file)
)
# full results gene by cell
full_ck_result_file <- glue::glue("{params$sample_id}_copykat_CNA_results.txt")
full_ck_result_paths <- c(
no_ref = file.path(result_dir, "copyKAT", "noref", full_ck_result_file),
with_ref = file.path(result_dir, "copyKAT", "ref", full_ck_result_file)
)Reports will be saved in the notebook directory. The
pre-processed and annotated Seurat object per samples are
saved in the result folder.
Here we defined function that will be used multiple time all along the notebook.
These functions are taken directly from the supplemental cell type report produced in scpca-nf
vec1 First vectorvec2 Second vectorreturn Jaccard similarity between the vectors
celltype_df The celltype_df data frame which must
contain these columns: colname1, colname2, and
barcodes
colname1 Column name, as a string, of first
categorical variable of interest
colname2 Column name, as a string, of second
categorical variable of interest
return Jaccard similarity matrix for the two columns.
colname1 values will be row names and colname2
values will be column names in the final matrix
make_jaccard_matrix <- function(celltype_df, colname1, colname2) {
# make lists of barcodes for each category, named by the category
id1_list <- split(celltype_df$barcodes, celltype_df[[colname1]])
id2_list <- split(celltype_df$barcodes, celltype_df[[colname2]])
# create the grid of comparisons
cross_df <- tidyr::expand_grid(id1 = names(id1_list), id2 = names(id2_list))
# calculate a single Jaccard index for each combination using split lists & ids
jaccard_scores <- cross_df |>
purrr::pmap_dbl(\(id1, id2){
jaccard(id1_list[[id1]], id2_list[[id2]])
})
# add scores to the comparison grid and convert to matrix
jaccard_matrix <- cross_df |>
dplyr::mutate(jaccard = jaccard_scores) |>
# convert to matrix
tidyr::pivot_wider(
names_from = "id2",
values_from = "jaccard"
) |>
tibble::column_to_rownames(var = "id1") |>
as.matrix()
return(jaccard_matrix)
}make_heatmap_list <- function(jaccard_matrices, column_title, legend_match, cluster_rows = TRUE) {
# Set heatmap padding option
heatmap_padding <- 0.2
ComplexHeatmap::ht_opt(TITLE_PADDING = grid::unit(heatmap_padding, "in"))
# list of heatmaps looking at SingleR/ CellAssign vs tumor/normal
heatmap <- jaccard_matrices |>
purrr::imap(
\(celltype_mat, celltype_method) {
ComplexHeatmap::Heatmap(
t(celltype_mat), # transpose because matrix rows are in common & we want a vertical arrangement
col = circlize::colorRamp2(c(0, 1), colors = c("white", "darkslateblue")),
border = TRUE,
## Row parameters
cluster_rows = cluster_rows,
row_title = celltype_method,
row_title_gp = grid::gpar(fontsize = 12),
row_title_side = "left",
row_names_side = "left",
row_dend_side = "right",
row_names_gp = grid::gpar(fontsize = 10),
## Column parameters
cluster_columns = TRUE,
column_title = column_title,
column_title_gp = grid::gpar(fontsize = 12),
column_names_side = "bottom",
column_names_gp = grid::gpar(fontsize = 10),
column_names_rot = 90,
## Legend parameters
heatmap_legend_param = list(
title = "Jaccard index",
direction = "vertical",
legend_width = unit(1.5, "in")
),
show_heatmap_legend = celltype_method == legend_match,
)
}
) |>
# concatenate vertically into HeatmapList object
purrr::reduce(ComplexHeatmap::`%v%`)
return(heatmap)
}annotation_column will be the columns and will be
compared to each method listed in methods_to_compareplot_jaccard <- function(classification_df,
annotation_column, # single column of annotations to compare to all methods
methods_to_compare, # list of methods to compare to annotations
column_title, # title for columns
legend_match, # what legend to keep, should be a name in `methods_to_compare`
cluster_rows = TRUE # whether or not to cluster the rows of the heatmap
) {
# create jaccard matrices
jaccard_matrices <- methods_to_compare |>
purrr::map(\(name) {
make_jaccard_matrix(
classification_df,
annotation_column,
name
)
})
heatmap <- make_heatmap_list(jaccard_matrices, column_title, legend_match, cluster_rows)
return(heatmap)
}Seurat objectBelow we look at the heatmaps produced by CopyKAT.
Below we prepare and plot a UMAP that shows which cells are
classified as diploid, aneuploid, and not defined by
CopyKAT. We show a side by side UMAP with results from
running CopyKAT both with and without a reference of normal
cells.
# read in ck predictions from both reference types (no_normal and with_normal)
ck_results_df <- predictions_paths |>
purrr::map(readr::read_tsv) |>
dplyr::bind_rows(.id = "reference_used")
# get umap coordinate
umap_df <- srat[["umap"]]@cell.embeddings |>
as.data.frame() |>
rownames_to_column('barcodes')
cnv_df <- umap_df |>
dplyr::left_join(ck_results_df, by = c("barcodes" = "cell.names")) ggplot(cnv_df, aes(x = umap_1, y = umap_2, color = copykat.pred)) +
geom_point(alpha = 0.5, size = 0.5) +
theme_bw() +
facet_wrap(vars(reference_used))To validate some of these annotations, we can also look at some commonly found copy number variations in Wilms tumor patients:
Although these are the most frequent, there are patients who do not have any of these alterations and patients that only have some of these alterations. See Tirode et al., and Crompton et al..
CopyKAT outputs a matrix that contains the estimated
copy numbers for each gene in each cell. We can read that in and look at
the mean estimated copy numbers for each chromosome across each cell. We
might expect that tumor cells would show an increased estimated copy
number in Chr1q, and/or a loss of Chr1p, Chr11p and Chr16q.
# read in full gene by cell copy number detection results
full_ck_results_df <- full_ck_result_paths |>
purrr::map(readr::read_tsv) |>
dplyr::bind_rows(.id = "reference_used")
# for every cell, calculate the mean detection level across all genes in a given chromosome
full_cnv_df <- full_ck_results_df |>
tidyr::pivot_longer(
cols = -c(
reference_used,
chrom
),
names_to = "barcodes",
values_to = "cnv_detection"
) |>
dplyr::group_by(chrom, barcodes, reference_used) |>
dplyr::summarise(mean_cnv_detection = mean(cnv_detection))
# join with cnv info
cnv_df <- cnv_df |>
dplyr::left_join(full_cnv_df, by = c("barcodes", "reference_used")) |>
dplyr::filter(!is.na(chrom))Let’s look at the distribution of CNV estimation in cells that are
called aneuploid and diploid by CopyKAT.
# create faceted density plots showing estimation of CNV detection across each chr of interest
# colored by aneuploid/diploid estimation
ggplot(cnv_df, aes(x = mean_cnv_detection, color = copykat.pred)) +
geom_density() +
theme_bw() +
facet_grid(
rows = vars(chrom),
cols = vars(reference_used)
)# record the versions of the packages used in this analysis and other environment information
sessionInfo()## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Vienna
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggalluvial_0.12.5 org.Hs.eg.db_3.19.1 AnnotationDbi_1.66.0 IRanges_2.38.1 S4Vectors_0.42.1 Biobase_2.64.0 BiocGenerics_0.50.0 clusterProfiler_4.12.6
## [9] enrichplot_1.24.4 msigdbr_7.5.1 patchwork_1.3.0 lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2
## [17] readr_2.1.5 tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0 SCpubr_2.0.2 sctransform_0.4.1 Seurat_5.1.0
## [25] SeuratObject_5.0.2 sp_2.1-4 optparse_1.7.5 Matrix_1.7-0
##
## loaded via a namespace (and not attached):
## [1] RcppAnnoy_0.0.22 splines_4.4.1 later_1.3.2 ggplotify_0.1.2 R.oo_1.26.0 polyclip_1.10-7 fastDummies_1.7.4
## [8] lifecycle_1.0.4 httr2_1.0.5 rprojroot_2.0.4 globals_0.16.3 lattice_0.22-6 vroom_1.6.5 MASS_7.3-61
## [15] magrittr_2.0.3 sass_0.4.9 plotly_4.10.4 rmarkdown_2.28 jquerylib_0.1.4 yaml_2.3.10 httpuv_1.6.15
## [22] spam_2.10-0 spatstat.sparse_3.1-0 reticulate_1.39.0 cowplot_1.1.3 pbapply_1.7-2 DBI_1.2.3 RColorBrewer_1.1-3
## [29] abind_1.4-8 zlibbioc_1.50.0 Rtsne_0.17 R.utils_2.12.3 ggraph_2.2.1 yulab.utils_0.1.7 tweenr_2.0.3
## [36] rappdirs_0.3.3 GenomeInfoDbData_1.2.12 ggrepel_0.9.6 irlba_2.3.5.1 listenv_0.9.1 spatstat.utils_3.1-0 tidytree_0.4.6
## [43] goftest_1.2-3 RSpectra_0.16-2 spatstat.random_3.3-2 fitdistrplus_1.2-1 parallelly_1.38.0 leiden_0.4.3.1 codetools_0.2-20
## [50] getopt_1.20.4 DOSE_3.30.5 ggforce_0.4.2 DT_0.33 tidyselect_1.2.1 aplot_0.2.3 UCSC.utils_1.0.0
## [57] farver_2.1.2 viridis_0.6.5 matrixStats_1.4.1 spatstat.explore_3.3-2 jsonlite_1.8.9 tidygraph_1.3.1 progressr_0.14.0
## [64] ggridges_0.5.6 survival_3.7-0 tools_4.4.1 treeio_1.28.0 ica_1.0-3 Rcpp_1.0.13 glue_1.8.0
## [71] gridExtra_2.3 xfun_0.47 qvalue_2.36.0 GenomeInfoDb_1.40.1 withr_3.0.1 fastmap_1.2.0 fansi_1.0.6
## [78] digest_0.6.37 gridGraphics_0.5-1 timechange_0.3.0 R6_2.5.1 mime_0.12 colorspace_2.1-1 scattermore_1.2
## [85] GO.db_3.19.1 tensor_1.5 spatstat.data_3.1-2 RSQLite_2.3.7 R.methodsS3_1.8.2 utf8_1.2.4 generics_0.1.3
## [92] data.table_1.16.0 graphlayouts_1.2.0 httr_1.4.7 htmlwidgets_1.6.4 scatterpie_0.2.4 uwot_0.2.2 pkgconfig_2.0.3
## [99] gtable_0.3.5 blob_1.2.4 lmtest_0.9-40 XVector_0.44.0 shadowtext_0.1.4 htmltools_0.5.8.1 fgsea_1.30.0
## [106] dotCall64_1.1-1 scales_1.3.0 png_0.1-8 spatstat.univar_3.0-1 ggfun_0.1.6 knitr_1.48 rstudioapi_0.16.0
## [113] tzdb_0.4.0 reshape2_1.4.4 nlme_3.1-166 cachem_1.1.0 zoo_1.8-12 KernSmooth_2.23-24 parallel_4.4.1
## [120] miniUI_0.1.1.1 pillar_1.9.0 grid_4.4.1 vctrs_0.6.5 RANN_2.6.2 promises_1.3.0 xtable_1.8-4
## [127] cluster_2.1.6 evaluate_1.0.0 cli_3.6.3 compiler_4.4.1 rlang_1.1.4 crayon_1.5.3 future.apply_1.11.2
## [134] labeling_0.4.3 fs_1.6.4 plyr_1.8.9 stringi_1.8.4 BiocParallel_1.38.0 viridisLite_0.4.2 deldir_2.0-4
## [141] babelgene_22.9 munsell_0.5.1 Biostrings_2.72.1 lazyeval_0.2.2 spatstat.geom_3.3-3 GOSemSim_2.30.2 RcppHNSW_0.6.0
## [148] hms_1.1.3 bit64_4.5.2 future_1.34.0 KEGGREST_1.44.1 shiny_1.9.1 highr_0.11 ROCR_1.0-11
## [155] igraph_2.0.3 memoise_2.0.1 bslib_0.8.0 ggtree_3.12.0 fastmatch_1.1-4 bit_4.5.0 gson_0.1.0
## [162] ape_5.8